{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 疑难杂症：sklearn中的pipeline报错问题"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "scikit-learn 0.20.3，在python3.5以后都会报错"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-danger alertdanger\" style=\"margin-top: 20px\">\n",
    "\n",
    "TypeError: 'Pipeline' object is not subscriptable\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![img_0201](.\\img\\0101.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "原因\n",
    "\n",
    "[changelog](https://scikit-learn.org/0.21/whats_new.html#sklearn-pipeline)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![img_0201](.\\img\\0102.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "解决办法，升级相关库的版本\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "!pip install --upgrade scikit-learn\n",
    "\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "#### 版本声明\n",
    "\n",
    "|名称|版本|简介|\n",
    "|----|----|----|\n",
    "|$python$|$3.7.3$|编程语言|\n",
    "|$scikit-learn$|$0.22.1$|机器学习|\n",
    "|$numpy$|$1.18.1$|数组运算|\n",
    "|$scipy$|$1.4.1$|数学运算|\n",
    "|$joblib$|$0.14.1$|不清楚|"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 例子"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "导入几个库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:26.420599Z",
     "start_time": "2020-01-20T02:53:25.851070Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn import svm\n",
    "from sklearn.datasets import make_classification\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.feature_selection import f_regression\n",
    "from sklearn.pipeline import Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:28.067359Z",
     "start_time": "2020-01-20T02:53:28.053375Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([[ 0.61118028,  0.07396296, -0.49596905, ..., -0.51753365,\n",
       "         -0.37339927, -0.70521074],\n",
       "        [-0.55470506, -1.26634051, -1.03437283, ..., -0.05798395,\n",
       "          0.07377011,  0.60247721],\n",
       "        [ 0.72456704, -0.22624522,  1.28626861, ...,  1.06456868,\n",
       "         -0.45374431,  0.44663973],\n",
       "        ...,\n",
       "        [ 1.25561121,  0.40561759,  1.5316888 , ...,  0.71500701,\n",
       "          0.48056211,  0.40041203],\n",
       "        [ 1.72707396, -0.00827807,  1.20562808, ...,  0.69476103,\n",
       "          1.3238748 ,  0.93299664],\n",
       "        [-0.47240735, -0.03014427,  1.7691167 , ..., -0.56770578,\n",
       "          0.28012139,  0.3905229 ]]),\n",
       " array([0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,\n",
       "        1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,\n",
       "        0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,\n",
       "        1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,\n",
       "        0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1]))"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# generate some data to play with\n",
    "X, y = make_classification(n_informative=5, n_redundant=0, random_state=42)\n",
    "X, y"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "生成两种算法实例"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:28.482247Z",
     "start_time": "2020-01-20T02:53:28.478258Z"
    }
   },
   "outputs": [],
   "source": [
    "# ANOVA SVM-C\n",
    "anova_filter = SelectKBest(f_regression, k=5)\n",
    "clf = svm.SVC(kernel='linear')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:29.015371Z",
     "start_time": "2020-01-20T02:53:29.010396Z"
    }
   },
   "outputs": [],
   "source": [
    "anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "通过参数名设置参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:29.402133Z",
     "start_time": "2020-01-20T02:53:29.390619Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "         steps=[('anova',\n",
       "                 SelectKBest(k=10,\n",
       "                             score_func=<function f_regression at 0x000002451623F1E0>)),\n",
       "                ('svc',\n",
       "                 SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None,\n",
       "                     coef0=0.0, decision_function_shape='ovr', degree=3,\n",
       "                     gamma='scale', kernel='linear', max_iter=-1,\n",
       "                     probability=False, random_state=None, shrinking=True,\n",
       "                     tol=0.001, verbose=False))],\n",
       "         verbose=False)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# You can set the parameters using the names issued\n",
    "# For instance, fit using a k of 10 in the SelectKBest\n",
    "# and a parameter 'C' of the svm\n",
    "anova_svm.set_params(anova__k=10, svc__C=.1).fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:29.583613Z",
     "start_time": "2020-01-20T02:53:29.573672Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.83"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prediction = anova_svm.predict(X)\n",
    "anova_svm.score(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:29.783081Z",
     "start_time": "2020-01-20T02:53:29.771113Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "         steps=[('anova',\n",
       "                 SelectKBest(k=10,\n",
       "                             score_func=<function f_regression at 0x000002451623F1E0>)),\n",
       "                ('svc',\n",
       "                 SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None,\n",
       "                     coef0=0.0, decision_function_shape='ovr', degree=3,\n",
       "                     gamma='scale', kernel='linear', max_iter=-1,\n",
       "                     probability=False, random_state=None, shrinking=True,\n",
       "                     tol=0.001, verbose=False))],\n",
       "         verbose=False)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "anova_svm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:30.090546Z",
     "start_time": "2020-01-20T02:53:30.083564Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sklearn.feature_selection._univariate_selection.SelectKBest"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(anova_svm['anova'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "报错，似乎svm用不了索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:35.530164Z",
     "start_time": "2020-01-20T02:53:35.523182Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([False, False,  True,  True, False, False,  True,  True, False,\n",
       "        True, False,  True,  True, False,  True, False,  True,  True,\n",
       "       False, False])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# getting the selected features chosen by anova_filter\n",
    "anova_svm['anova'].get_support()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:35.735152Z",
     "start_time": "2020-01-20T02:53:35.728170Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([False, False,  True,  True, False, False,  True,  True, False,\n",
       "        True, False,  True,  True, False,  True, False,  True,  True,\n",
       "       False, False])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Another way to get selected features chosen by anova_filter\n",
    "anova_svm.named_steps.anova.get_support()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:35.914971Z",
     "start_time": "2020-01-20T02:53:35.904030Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "         steps=[('anova',\n",
       "                 SelectKBest(k=10,\n",
       "                             score_func=<function f_regression at 0x000002451623F1E0>))],\n",
       "         verbose=False)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Indexing can also be used to extract a sub-pipeline.\n",
    "sub_pipeline = anova_svm[:1]\n",
    "sub_pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:36.069559Z",
     "start_time": "2020-01-20T02:53:36.065567Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "coef = anova_svm[-1].coef_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:36.215169Z",
     "start_time": "2020-01-20T02:53:36.207189Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "anova_svm['svc'] is anova_svm[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:36.357792Z",
     "start_time": "2020-01-20T02:53:36.350805Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1, 10)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "coef.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:36.537305Z",
     "start_time": "2020-01-20T02:53:36.531322Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1, 20)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sub_pipeline.inverse_transform(coef).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-20T02:53:36.771679Z",
     "start_time": "2020-01-20T02:53:36.765695Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)]'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import sys \n",
    "sys.version"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 完整代码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-19T04:25:29.792265Z",
     "start_time": "2020-01-19T04:25:29.760353Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn import svm\n",
    "from sklearn.datasets import make_classification\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.feature_selection import f_regression\n",
    "from sklearn.pipeline import Pipeline\n",
    "# generate some data to play with\n",
    "X, y = make_classification(\n",
    "     n_informative=5, n_redundant=0, random_state=42)\n",
    "#ANOVA SVM-C\n",
    "anova_filter = SelectKBest(f_regression, k=5)\n",
    "clf = svm.SVC(kernel='linear')\n",
    "anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)])\n",
    "#You can set the parameters using the names issued\n",
    "#For instance, fit using a k of 10 in the SelectKBest\n",
    "#and a parameter 'C' of the svm\n",
    "anova_svm.set_params(anova__k=10, svc__C=.1).fit(X, y)\n",
    "#Pipeline(steps=[('anova', SelectKBest(...)), ('svc', SVC(...))])\n",
    "prediction = anova_svm.predict(X)\n",
    "print(anova_svm.score(X, y))\n",
    "# getting the selected features chosen by anova_filter\n",
    "print(anova_svm['anova'].get_support())\n",
    "# Another way to get selected features chosen by anova_filter\n",
    "print(anova_svm.named_steps.anova.get_support())\n",
    "# Indexing can also be used to extract a sub-pipeline.\n",
    "sub_pipeline = anova_svm[:1]\n",
    "print(sub_pipeline)\n",
    "Pipeline(steps=[('anova', SelectKBest(...))])\n",
    "coef = anova_svm[-1].coef_\n",
    "print(anova_svm['svc'] is anova_svm[-1])\n",
    "print(coef.shape)\n",
    "print(sub_pipeline.inverse_transform(coef).shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-01-19T04:59:16.113198Z",
     "start_time": "2020-01-19T04:58:59.595015Z"
    }
   },
   "outputs": [],
   "source": [
    "!pip install --upgrade scikit-learn"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "nbTranslate": {
   "displayLangs": [
    "*"
   ],
   "hotkey": "alt-t",
   "langInMainMenu": true,
   "sourceLang": "en",
   "targetLang": "fr",
   "useGoogleTranslate": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
